The purpose of my project is to investigate the lower utilization of hospice care by minority racial groups. There is a history of racial minority groups having lower levels of access to medical care, and this is also true for hospice care. My goal is not to explain why or how this is occurring; that is the job of social and healthcare professionals. My goal is to collect and evaluate the data so that these professionals can direct their efforts in the best way possible. The region of concern will be Louisiana and Mississippi.
2018 Medicare hospice claims.
In this notebook I develop the model for each state and classify them using the model from that state.
References
https://dataaspirant.com/random-forest-algorithm-machine-learing/
https://machinelearningmastery.com/implement-random-forest-scratch-python/
https://machinelearningmastery.com/random-forest-ensemble-in-python/
https://www.datacamp.com/community/tutorials/random-forests-classifier-python
https://www.datacamp.com/community/tutorials/random-forests-classifier-python#building
# packages
import pandas as pd
import numpy as np
from numpy.random import randint
import re
import csv
import os
import pickle
from io import StringIO
from matplotlib import pyplot as plt
import seaborn as sns
#format the work space
# Widen the notebook cells to 95% of the browser window for easier viewing.
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))
# Render matplotlib figures inline and default every figure to 10x10 inches.
%matplotlib inline
plt.rcParams['figure.figsize'] = [10, 10]
# These are the dictionary descriptions for the categorical data
# SSA state/country code -> name (CMS MBSF coding; codes >= 54 are
# territories/foreign regions).  NOTE(review): several names are misspelled
# ('Minnisota', 'Pennsylvainia', 'WestVirgina', 'Wisconson') and codes 64 and
# 98 both map to 'Guam'.  The spellings are deliberately left as-is because
# output file names such as f'{states[s]}MBSF2018.pkl' are built from these
# exact strings -- correcting them would orphan existing pickle files.
states = {1:'Alabama',2:'Alaska',3:'Arizona',4:'Arkansas',5:'California',6:'Colorado',7:'Connecticut',8:'Delaware',
9:'DistOCol',10:'Florida',11:'Georgia',12:'Hawaii',13:'Idaho',14:'Illinois',15:'Indiana',16:'Iowa',17:'Kansas',
18:'Kentucky',19:'Louisiana',20:'Maine',21:'Maryland',22:'Massachusetts',23:'Michigan',24:'Minnisota',
25:'Mississippi',26:'Missouri',27:'Montana',28:'Nebraska',29:'Nevada',30:'NewHampshire',31:'NewJersey',
32:'NewMexico',33:'NewYork',34:'NorthCarolina',35:'NorthDakota',36:'Ohio',37:'Oklahoma',38:'Oregon',39:'Pennsylvainia',
40:'PuertoRico',41:'RhodeIsland',42:'SouthCarolina',43:'SouthDakota',44:'Tennessee',45:'Texas',46:'Utah',47:'Vermont',
48:'VirginIsland',49:'Virginia',50:'Washington',51:'WestVirgina',52:'Wisconson',53:'Wyoming',54:'Africa',55:'Canada_Islands',
56:'CentAmerWstInd',57:'Europe',58:'Mexico',59:'Oceania',60:'Philippines',61:'SouthAmerica',62:'UsPosessions',
63:'AmericanSamoa',64:'Guam',65:'SaipanNorthMarianas',97:'NorthernMarianas',98:'Guam',99:'Unknown'}
# Race code -> label; the integer codes are what the classifiers predict and
# what the summary loop below iterates over (range(0, 7)).
Race = {0:'Unknown', 1:'White',2:'Black',3:'Other',4:'Asian',5:'Hispanic',6:'North_American_Native'}
# Collapsed race grouping -- presumably for a White/Black/Other comparison;
# not referenced in this notebook section (TODO confirm it is used elsewhere).
Race_WBAllOther = {1:'White',2:'Black',3:'All Other'}
# Age-bucket code -> label; not referenced in this section.
AgeGroups = {0:'Unknown',1:'<64',2:'64-69',3:'70-74',4:'75-79',5:'80-84',6:'>84'}
# Hospice/HHA patient discharge status codes; used as confusion-matrix labels.
PTDCSTUS = {0:'Missing',1:'Still Patient',2:'DC Alive',3:'DC Deceased'}
# SSA is a look-up table that maps the SSA county id to a county name.
SSA = pd.read_csv(
    'e:/PatrickBernard/d_ssa.csv',
    names=['county_id', 'county'],
)
# SSA.set_index('county_id', inplace=True)  # optional: index the table by county id
# eyeball a few rows to confirm the load
SSA.sample(3)
# *** TODO: update this to retrieve more than one state, OR rewrite it to
# retrieve a single state and call it once per state of interest.
subject_states = [19]#,25]
# compare_states = [] # this may be used later to derive data for comparison regions
# with open(f'e:/PatrickBernard/{states[i]}MBSF2018.pkl', 'wb') as f:
for s in subject_states:
    # NOTE(review): each pass overwrites regional_data, so only the last state
    # in subject_states survives the loop (see the TODO above).
    with open(f'e:/PatrickBernard/{states[s]}MBSF2018.pkl', 'rb') as f:
        regional_data = pickle.load(f)
# quick sanity checks on the loaded extract
regional_data.info()
regional_data.columns
regional_data.sample(4)
# Candidate predictors for the discharge-status model, grouped by claim
# source.  Order is preserved exactly (it determines model column order).
_beneficiary = ['Bene_ID', 'D_State', 'D_SSA', 'D_SSA2', 'D_DIED', 'D_Sex',
                'D_Race', 'D_Age', 'D_TM', 'D_TM_Died', 'D_MA_Plan', 'D_MA_Plan_Died']
_hospice = ['D_HOS_PT', 'D_HOS_PROVIDER_ID', 'D_HOS_Died', 'D_HOS_UTIL_DAY']
_inpatient = ['D_INPT_PT', 'D_INPT_PROVIDER_ID', 'D_INPT_DC_Status',
              'D_INPT_Died', 'D_INPT_UTIL_DAY']
_snf = ['D_SNF_PT', 'D_SNF_PROVIDER_ID', 'D_SNF_DC_Status', 'D_SNF_Died',
        'D_SNF_UTIL_DAY']
_hha = ['D_HHA_PT', 'D_HHA_PROVIDER_ID', 'D_HHA_PTDCSTUS', 'D_HHA_DIED',
        'D_HHA_Days']
# RANDOM_NUM is a random-noise column used as a baseline when judging
# feature importance.  NOTE(review): Bene_ID (an identifier) and D_HOS_Died
# look like leakage candidates for this target -- worth confirming.
features = _beneficiary + _hospice + _inpatient + _snf + _hha + ['RANDOM_NUM']
# Target: hospice patient discharge status.
labels = ['D_HOS_PTDCSTUS']
# Import packages for classification
from numpy import mean
from numpy import std
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
# impute missing data: -999 is an out-of-band sentinel so the tree model can
# treat "missing" as its own category
regional_data.fillna(-999, inplace = True)
# Create a random baseline field for feature-importance comparison.
# BUG FIX: randint(0, 10) returns a single scalar, which pandas broadcast to a
# CONSTANT column -- useless as a random baseline.  Draw one value per row.
regional_data['RANDOM_NUM'] = randint(0, 10, size=len(regional_data))
regional_data.sample(5)
# build the training and test data sets
X_data = regional_data[features]
y_data = regional_data['D_HOS_PTDCSTUS']
# 70% training / 30% test; seeded for reproducibility, consistent with the
# random_state=1 used by the CV splitter below
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.3, random_state=1)
# Random forest for hospice discharge status: 100 shallow trees (depth 6,
# >= 3 samples per leaf) to limit over-fitting.
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=6,
    min_samples_leaf=3,
)
# fit the model to predict on Hospice discharge status
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# evaluate the model with repeated stratified 10-fold cross-validation
# (cross_val_score clones and refits the estimator on each fold)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(
    model, X_data, y_data,
    scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise',
)
# report performance
print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))
print(f'classes: {model.classes_}\nnum_features: {model.n_features_}\noutputs: {model.n_outputs_} ')
# Classify every record (including rows the model trained on) and keep the
# predicted discharge status next to the observed one.
regional_data['hos_pred_dc'] = model.predict(regional_data[features])
# spot-check predictions where the observed status is not the -999 sentinel
regional_data[['D_HOS_PTDCSTUS','hos_pred_dc']][regional_data.D_HOS_PTDCSTUS!= -999].sample(10)
regional_data.describe().T
# Rank features by importance; RANDOM_NUM marks the noise floor.
features_imp = pd.Series(model.feature_importances_,
                         index=regional_data[features].columns)
features_imp.nlargest(34).plot(kind='barh')
plt.title(f'RF Feature Importance ({states[s]})')
# plt.savefig(f'{figPath}{states[s]}_RF_FeatureImportance.png',bbox_inches = 'tight')
from sklearn.metrics import plot_confusion_matrix
# NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0 and
# removed in 1.2; migrate to ConfusionMatrixDisplay.from_estimator on upgrade.
plot_confusion_matrix(model, X_test, y_test,
display_labels=PTDCSTUS.values(),
cmap=plt.cm.Blues,
normalize=None)
# FIX: corrected the misspelled chart title ("Confussion Matrics")
plt.title(f'Hospice Discharge Status Confusion Matrix ({states[s]})')
# Re-declare the predictor list (notebook cell re-run convenience).  The
# contents and order are identical to the discharge-model feature list above.
features = (
    # beneficiary / enrollment
    ['Bene_ID', 'D_State', 'D_SSA', 'D_SSA2', 'D_DIED', 'D_Sex', 'D_Race',
     'D_Age', 'D_TM', 'D_TM_Died', 'D_MA_Plan', 'D_MA_Plan_Died']
    # hospice
    + ['D_HOS_PT', 'D_HOS_PROVIDER_ID', 'D_HOS_Died', 'D_HOS_UTIL_DAY']
    # inpatient
    + ['D_INPT_PT', 'D_INPT_PROVIDER_ID', 'D_INPT_DC_Status', 'D_INPT_Died',
       'D_INPT_UTIL_DAY']
    # skilled nursing facility
    + ['D_SNF_PT', 'D_SNF_PROVIDER_ID', 'D_SNF_DC_Status', 'D_SNF_Died',
       'D_SNF_UTIL_DAY']
    # home health agency
    + ['D_HHA_PT', 'D_HHA_PROVIDER_ID', 'D_HHA_PTDCSTUS', 'D_HHA_DIED',
       'D_HHA_Days']
    # random baseline for feature importance
    + ['RANDOM_NUM']
)
# load the training data set (note the non-standard .pk2 extension, which
# keeps this file out of the bulk-processing loops below that match on .pkl)
with open(f'e:/PatrickBernard/training_set.pk2', 'rb') as f:
    training_data = pickle.load(f)
# Create a random baseline field for confounding / importance comparison.
# BUG FIX: randint(0, 10) returned a scalar that pandas broadcast to a
# constant column; a constant cannot act as a random baseline.  Draw one
# value per row instead.
training_data['RANDOM_NUM'] = randint(0, 10, size=len(training_data))
# impute missing data with the -999 sentinel
training_data.fillna(-999, inplace = True)
# build the training and test data sets
# X_data = training_data[feat_r]
# y_data = training_data['D_HOS_PTDCSTUS']
X_train, X_test, y_train, y_test = train_test_split(training_data[features], training_data['D_HOS_PTDCSTUS'], test_size=0.25) # 75% training 25% test
# shallow 100-tree forest, >= 3 samples per leaf, same settings as the
# per-state model above
model_discharge = RandomForestClassifier(n_estimators=100, max_depth = 6, min_samples_leaf = 3)
# fit the model to predict on Hospice discharge status
model_discharge.fit(X_train,y_train)
y_pred=model_discharge.predict(X_test)
# Persist the fitted model so the per-state classification loop can reload it.
# FIX: write through a context manager -- pickle.dump(model, open(...)) left
# the file handle open (resource leak, and the write may not be flushed
# promptly on all platforms).
with open('e:/PatrickBernard/hosDischarge/rf_model_discharge.mdl', 'wb') as f:
    pickle.dump(model_discharge, f)
# evaluate the model: stratified 10-fold cross-validation, repeated 3 times
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(
    model_discharge,
    training_data[features],
    training_data['D_HOS_PTDCSTUS'],
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    error_score='raise',
)
# report performance
print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))
# Set the location for saved figures and model artifacts
figPath = 'e:/PatrickBernard/hosDischarge/'
# Retrieve the shared discharge model trained above.
# FIX: load through a context manager -- pickle.load(open(...)) leaked the
# file handle.
with open(f'{figPath}rf_model_discharge.mdl', 'rb') as f:
    model_dis = pickle.load(f)
# Classify every state's MBSF extract with the shared discharge model.
for file in os.listdir(f'e:/PatrickBernard/'):
    # only the per-state extracts end in .pkl (the training set is .pk2)
    if not file.endswith('.pkl'):
        continue
    with open(f'e:/PatrickBernard/{file}', 'rb') as f:
        regional_data = pickle.load(f)
    print(file)
    # Retrieve the state identifier
    # (assumes column 1 of row 0 is D_State -- TODO confirm)
    s = regional_data.iloc[0, 1]
    # impute missing data with the -999 sentinel
    regional_data.fillna(-999, inplace=True)
    # Create a random baseline field for confounding.
    # BUG FIX: draw one random value per row instead of broadcasting a single
    # scalar, so RANDOM_NUM can act as a random baseline feature.
    regional_data['RANDOM_NUM'] = randint(0, 10, size=len(regional_data))
    # define the features (the same 32 columns the model was trained on)
    feat_r = ['Bene_ID', 'D_State', 'D_SSA', 'D_SSA2', 'D_DIED', 'D_Sex', 'D_Race', 'D_Age', 'D_TM', 'D_TM_Died',
              'D_MA_Plan', 'D_MA_Plan_Died', 'D_HOS_PT', 'D_HOS_PROVIDER_ID', 'D_HOS_Died', 'D_HOS_UTIL_DAY',
              'D_INPT_PT', 'D_INPT_PROVIDER_ID', 'D_INPT_DC_Status', 'D_INPT_Died', 'D_INPT_UTIL_DAY', 'D_SNF_PT',
              'D_SNF_PROVIDER_ID', 'D_SNF_DC_Status', 'D_SNF_Died', 'D_SNF_UTIL_DAY', 'D_HHA_PT',
              'D_HHA_PROVIDER_ID', 'D_HHA_PTDCSTUS', 'D_HHA_DIED', 'D_HHA_Days', 'RANDOM_NUM']
    regional_data['PRED_HOS_PTDCSTUS'] = model_dis.predict(regional_data[feat_r])
    plot_confusion_matrix(model_dis, regional_data[features], regional_data['D_HOS_PTDCSTUS'],
                          display_labels=PTDCSTUS.values(),
                          cmap=plt.cm.Blues,
                          normalize=None)
    # FIX: corrected the misspelled title ("Confussion Matrics"); the saved
    # file name is left unchanged so existing downstream references keep
    # working.
    plt.title(f'Hospice Discharge Status Confusion Matrix ({states[s]})')
    plt.savefig(f'{figPath}{states[s]}_DischargeStatusConfussionMatrics.png', bbox_inches='tight')
    plt.close()
    # write the per-state predictions beside the source file name stem
    nn = file[:-4]
    regional_data.to_csv(f'{figPath}pred_{nn}.csv', index=False)
If we use the usage features in the data, can we determine the race of the individual?
# Usage-only predictors for the race model: D_Race itself is excluded (it is
# the target here), while D_HOS_PTDCSTUS and N_Providers are added.
_demographics = ['Bene_ID', 'D_State', 'D_SSA', 'D_SSA2', 'D_DIED', 'D_Sex',
                 'D_Age', 'D_TM', 'D_TM_Died', 'D_MA_Plan', 'D_MA_Plan_Died']
_hospice_use = ['D_HOS_PT', 'D_HOS_PROVIDER_ID', 'D_HOS_Died',
                'D_HOS_UTIL_DAY', 'D_HOS_PTDCSTUS']
_inpatient_use = ['D_INPT_PT', 'D_INPT_PROVIDER_ID', 'D_INPT_DC_Status',
                  'D_INPT_Died', 'D_INPT_UTIL_DAY']
_snf_use = ['D_SNF_PT', 'D_SNF_PROVIDER_ID', 'D_SNF_DC_Status', 'D_SNF_Died',
            'D_SNF_UTIL_DAY']
_hha_use = ['D_HHA_PT', 'D_HHA_PROVIDER_ID', 'D_HHA_PTDCSTUS', 'D_HHA_DIED',
            'D_HHA_Days']
feat_r = (_demographics + _hospice_use + _inpatient_use + _snf_use + _hha_use
          + ['N_Providers'])
# Target for this experiment: the recorded race code.
labels_r = ['D_Race']
Create a model from the training data set.
# Set the location of the figure output
figPath = 'e:/PatrickBernard/figures/RF_Class/'
# Retrieve the model
#model_r = pickle.load(open('e:/PatrickBernard/rf_model.mdl','rb'))
# Build one race-classification model per state and summarise predicted
# hospice utilization by predicted race.
for file in os.listdir(f'e:/PatrickBernard/'):
    # SYNTAX FIX: the original condition had an unbalanced parenthesis
    # (``file.endswith('.pkl'))== False``), which is a SyntaxError.
    if not file.endswith('.pkl'):
        continue
    with open(f'e:/PatrickBernard/{file}', 'rb') as f:
        regional_data = pickle.load(f)
    # Retrieve the state identifier
    # (assumes column 1 of row 0 is D_State -- TODO confirm)
    s = regional_data.iloc[0, 1]
    # impute missing data with the -999 sentinel
    regional_data.fillna(-999, inplace=True)
    # Create a random baseline field for confounding.
    # BUG FIX: draw one random value per row -- a broadcast scalar makes a
    # constant column, which cannot act as a random baseline.
    regional_data['RANDOM_NUM'] = randint(0, 10, size=len(regional_data))
    # define the features: usage/claims columns only -- D_Race is excluded
    # because it is the prediction target here
    feat_r = ['Bene_ID', 'D_State', 'D_SSA', 'D_SSA2', 'D_DIED', 'D_Sex', 'D_Age', 'D_TM', 'D_TM_Died',
              'D_MA_Plan', 'D_MA_Plan_Died', 'D_HOS_PT', 'D_HOS_PROVIDER_ID', 'D_HOS_Died', 'D_HOS_UTIL_DAY', 'D_HOS_PTDCSTUS',
              'D_INPT_PT', 'D_INPT_PROVIDER_ID', 'D_INPT_DC_Status', 'D_INPT_Died', 'D_INPT_UTIL_DAY', 'D_SNF_PT',
              'D_SNF_PROVIDER_ID', 'D_SNF_DC_Status', 'D_SNF_Died', 'D_SNF_UTIL_DAY', 'D_HHA_PT',
              'D_HHA_PROVIDER_ID', 'D_HHA_PTDCSTUS', 'D_HHA_DIED', 'D_HHA_Days', 'N_Providers', 'RANDOM_NUM']
    #labels_r = ['D_Race']
    # build the training and test data sets
    X_data = regional_data[feat_r]
    y_data = regional_data['D_Race']
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.3)  # 70% training 30% test
    # Define the model
    model_r = RandomForestClassifier(n_estimators=100, max_depth=6, min_samples_leaf=3)
    # fit the model to predict race from utilization
    model_r.fit(X_train, y_train)
    # NOTE(review): predictions are made on the same data the model was fitted
    # on (train + test), so the per-state summary is optimistic.
    regional_data['Race_pred'] = model_r.predict(regional_data[feat_r])
    feat_importances = pd.Series(model_r.feature_importances_, regional_data[feat_r].columns)
    feat_importances.nlargest(32).plot(kind='barh')
    plt.title(f'RF Feature Importance ({states[s]})')
    plt.savefig(f'{figPath}{states[s]}_RF_FeatureImportance.png', bbox_inches='tight')
    # FIX: close the figure so successive states do not overlay on the same
    # axes (the discharge-status loop above already does this).
    plt.close()
    # tag the importance series with its state before appending to the summary
    feat_importances['state'] = states[s]
    # Create a classification utilization summary record for this state
    s_data = {'state': states[s], 'Tot_record': len(regional_data)}
    # race_count = regional_data.groupby('Race_pred')['Race_pred'].count()
    for i in range(0, 7):
        z_data = {f'Pred_{Race[i]}_alive': len(regional_data[(regional_data.Race_pred == i) & (regional_data.D_DIED == 0)].index),
                  f'Pred_{Race[i]}_deceased': len(regional_data[(regional_data.Race_pred == i) & (regional_data.D_DIED == 1)].index),
                  f'Pred_{Race[i]}_HOS_alive': len(regional_data[(regional_data.Race_pred == i) & (regional_data.D_HOS_Died == 0)].index),
                  f'Pred_{Race[i]}_HOS_deceased': len(regional_data[(regional_data.Race_pred == i) & (regional_data.D_HOS_Died == 1)].index)}
        # utilization = hospice deaths / all deaths for the predicted race;
        # guard against division by zero when no deaths were predicted
        if z_data[f'Pred_{Race[i]}_deceased'] == 0:
            util = {f'Pred_{Race[i]}_HOS_Utilization': 0}
        else:
            util = {f'Pred_{Race[i]}_HOS_Utilization': z_data[f'Pred_{Race[i]}_HOS_deceased'] / z_data[f'Pred_{Race[i]}_deceased']}
        z_data.update(util)
        s_data.update(z_data)
    # BUG FIX: the original list contained 'Pred_North_American Native_deceased'
    # (space instead of underscore), which never matched the underscore key
    # built from the Race dict above and produced a duplicate NaN column in
    # the summary frame -- the "duplicate columns" TODO below.
    cols = ['state', 'Tot_record', 'Pred_Unknown_alive', 'Pred_Unknown_deceased', 'Pred_Unknown_HOS_alive', 'Pred_Unknown_HOS_deceased',
            'Pred_Unknown_HOS_Utilization',
            'Pred_White_alive', 'Pred_White_deceased', 'Pred_White_HOS_alive', 'Pred_White_HOS_deceased', 'Pred_White_HOS_Utilization',
            'Pred_Black_alive', 'Pred_Black_deceased', 'Pred_Black_HOS_alive', 'Pred_Black_HOS_deceased', 'Pred_Black_HOS_Utilization',
            'Pred_Other_alive', 'Pred_Other_deceased', 'Pred_Other_HOS_alive', 'Pred_Other_HOS_deceased', 'Pred_Other_HOS_Utilization',
            'Pred_Asian_alive', 'Pred_Asian_deceased', 'Pred_Asian_HOS_alive', 'Pred_Asian_HOS_deceased', 'Pred_Asian_HOS_Utilization',
            'Pred_Hispanic_alive', 'Pred_Hispanic_deceased', 'Pred_Hispanic_HOS_alive', 'Pred_Hispanic_HOS_deceased',
            'Pred_Hispanic_HOS_Utilization',
            'Pred_North_American_Native_alive', 'Pred_North_American_Native_deceased', 'Pred_North_American_Native_HOS_alive',
            'Pred_North_American_Native_HOS_deceased', 'Pred_North_American_Native_HOS_Utilization']
    # Open the pickle file and append this state's summary record.
    # TODO once all of the columns are identified in the summary data order the data for clarity in the dataframe
    try:
        with open(f'e:/PatrickBernard/figures/RF_Class/RF_data.pkl', 'rb') as f:
            RF_data = pickle.load(f)
    except OSError:  # IOError is an alias of OSError in Python 3
        RF_data = pd.DataFrame(columns=cols)
    # NOTE(review): DataFrame.append is removed in pandas 2.0; switch to
    # pd.concat([RF_data, pd.DataFrame([s_data])], ignore_index=True) on upgrade.
    RF_data = RF_data.append(s_data, ignore_index=True)
    with open(f'e:/PatrickBernard/figures/RF_Class/RF_data.pkl', 'wb') as f:
        pickle.dump(RF_data, f)
    # Save the RF Feature Importance data
    try:
        with open(f'e:/PatrickBernard/figures/RF_Class/RF_FeatImportance.pkl', 'rb') as f:
            RF_FeatImportance = pickle.load(f)
    except OSError:
        RF_FeatImportance = pd.DataFrame()
    RF_FeatImportance = RF_FeatImportance.append(feat_importances, ignore_index=True)
    with open(f'e:/PatrickBernard/figures/RF_Class/RF_FeatImportance.pkl', 'wb') as f:
        pickle.dump(RF_FeatImportance, f)
Retrieve summary data and plot the results
# Load the accumulated per-state utilization summary written by the
# classification loop above.
with open(f'e:/PatrickBernard/figures/RF_Class/RF_data.pkl', 'rb') as f:
    summary_data = pickle.load(f)
# inspect the raw frame and its numeric summary
summary_data
summary_data.describe().T
# Column groups for dtype casting: col1 holds the integer count columns,
# col2 the float utilization-rate columns.
_races = ['Unknown', 'White', 'Black', 'Other', 'Asian', 'Hispanic']
col1 = ['Tot_record']
for _r in _races:
    col1 += [f'Pred_{_r}_alive', f'Pred_{_r}_deceased',
             f'Pred_{_r}_HOS_alive', f'Pred_{_r}_HOS_deceased']
# North American Native columns were listed in a different order originally;
# the order is irrelevant here (assignment aligns by column name).
col1 += ['Pred_North_American_Native_HOS_alive',
         'Pred_North_American_Native_HOS_deceased',
         'Pred_North_American_Native_alive',
         'Pred_North_American_Native_deceased']
col2 = [f'Pred_{_r}_HOS_Utilization'
        for _r in _races + ['North_American_Native']]
# Cast counts to int and utilization rates to float (the append/pickle
# round-trip can leave them as object dtype).
summary_data[col1] = summary_data[col1].astype(int)
summary_data[col2] = summary_data[col2].astype(float)
summary_data.info()
summary_data
# TODO determine why there are duplicate columns
summary_data.drop_duplicates(subset=None, keep='first', inplace=True)
# gap between predicted White and Black hospice utilization, per state
summary_data['pred_delta_w_b'] = summary_data['Pred_White_HOS_Utilization'] - summary_data['Pred_Black_HOS_Utilization']
# NOTE(review): the sort by 'state' is immediately superseded by the sort on
# 'pred_delta_w_b' below.
summary_data.sort_values('state', inplace=True)
summary_data.sort_values('pred_delta_w_b', inplace=True)
# reshape to long form for seaborn: one row per (state, series)
temp = pd.melt(
    summary_data,
    id_vars='state',
    value_vars=['Pred_White_HOS_Utilization', 'Pred_Black_HOS_Utilization', 'pred_delta_w_b'],
    var_name='race',
    value_name='utilization_percent',
    col_level=None,
)
# express utilization as a percentage
temp['utilization_percent'] = temp['utilization_percent'] * 100
temp.sample(4)
# Calculate the national utilization numbers (pooled across all states)
Pred_white_nat_utilisation = summary_data.Pred_White_HOS_deceased.sum() / summary_data.Pred_White_deceased.sum()
Pred_black_nat_utilisation = summary_data.Pred_Black_HOS_deceased.sum() / summary_data.Pred_Black_deceased.sum()
#plot the Figure
plt.figure(figsize=(25,10))
sns.set_context('talk')
sns.set_style('whitegrid')
# create plot
sns.barplot(x = 'state', y ='utilization_percent' ,
hue = 'race',
data = temp,
palette = 'tab20',
#order = ['male', 'female'],
capsize = 0.05,
saturation = 8,
ci = None,
)
# FIX: corrected "Predictited" in the annotation text
plt.text(26,92, f'Predicted Utilization\n White: {round(Pred_white_nat_utilisation*100,2)}\n Black: {round(Pred_black_nat_utilisation*100,2)} ',
style='normal',bbox={'facecolor':'white', 'alpha':0.75, 'pad':10},fontsize=16)
plt.legend(loc='best')
plt.tight_layout()
# FIX: corrected the "(RFby state)" spacing in the title; the saved file name
# is left unchanged so existing references keep working
plt.title('(RF by state) Predicted Utilization by Race for 2018')
plt.xticks(rotation = 45, ha='right')
plt.savefig(f'e:/PatrickBernard/figures/RF_Class/RFbystate_PredictedUtilizationByState2018.png',bbox_inches = 'tight')
# Load the accumulated per-state feature-importance records.
with open(f'e:/PatrickBernard/figures/RF_Class/RF_FeatImportance.pkl', 'rb') as f:
    FeatImport = pickle.load(f)
FeatImport
# TODO determine why there are duplicate columns -- fixed by testing for the
# correct file extension before processing
FeatImport.drop_duplicates(subset=None, keep='first', inplace=True)
FeatImport.sort_values('state', inplace=True)
FeatImport.describe().T
# FeatImport.mean().sort_values(inplace = True)
# order the columns by their mean importance, highest first
FeatImport = FeatImport.reindex(
    FeatImport.mean().sort_values(ascending=False).index,
    axis=1,
)
FeatImport.head(3)
# box plot of importance per feature across all states
plt.figure(figsize=(15, 10))
sns.set_context('talk')
sns.set_style("whitegrid")
sns.boxplot(data=FeatImport)
plt.tight_layout()
plt.title('RF Feature Importance\n(ordered by mean)')
plt.xticks(rotation=45, ha='right')
plt.savefig(f'e:/PatrickBernard/figures/RF_Class/RF_FeatureImportanceBoxPlot.png', bbox_inches='tight')
Build a mean-shift classifier
This attempt was abandoned because of limited computing resources; the classification did not complete.